####################################################################################################
# Script to generate Figure 4 in:
# Friends With Text as Data Benefits:
# Assessing and Extending the Use of Automated Text Analysis in Political Science and Political Psychology
# Martijn Schoonvelde, Gijs Schumacher, Bert Bakker
# Contact: mschoonvelde@gmail.com
####################################################################################################


###############
#load libraries
###############

rm(list=ls())
library(quanteda)
library(ggplot2)
library(stringr)
library(stargazer)
library(tidyverse)
library(hrbrthemes)
library(ggrepel)
library(tm)
library(lme4)
library(dplyr)

#set working directory to the folder that contains comb.corpus.RData and cmp.R
#setwd()

# Load the combined speech data; the statements below expect this to create an
# object named `corpus`.
load("comb.corpus.RData")

# Remove the eu* columns, which are not used anywhere in this script.
corpus$eutermwin <- NULL
corpus$eusentences <- NULL
corpus$eusentence <- NULL
corpus$eupres <- NULL
corpus$eucount <- NULL

# Number of tokens in each text, stored as a plain (unnamed) vector.
corpus$length <- unname(ntoken(corpus$text))

# Add manifesto left-right scale as per Lowe et al. (2011) (lr)
# Add manifesto progressive-conservative scale (pc)
# Add manifesto EU positive-negative (EU)
# NB: pc includes items per108 and per110
###########################################################
# Run the manifesto (CMP) preparation script. The loop below relies on it
# having created a data frame `cmp.data` with columns party, edate, lr and pc.
source("cmp.R")

# Parse the day-month-year date strings, drop speeches dated 2099-01-01
# (presumably a placeholder for a missing date -- TODO confirm), and keep the
# year as a string. subset() also drops rows whose date failed to parse,
# because it treats NA conditions as FALSE.
corpus$date <- as.Date(corpus$date, format = "%d-%m-%Y")
corpus <- subset(corpus, date != as.Date("2099-01-01"))
corpus$year <- format(corpus$date, "%Y")
corpus$lr <- NA_real_
corpus$pc <- NA_real_

# Link every speech to the CMP data and extract the lr and pc scores of the
# speaker's party, with the sign flipped. Speeches without a matching party,
# or dated before every election of that party, keep NA.
for (i in seq_len(nrow(corpus))) {
  party.rows <- subset(cmp.data, cmp.data$party == corpus$manifcode[i])
  if (nrow(party.rows) > 0) {
    # Elections held on or before the speech date.
    eligible <- which(corpus$date[i] >= party.rows$edate)
    if (length(eligible) > 0) {
      # NOTE(review): taking the last eligible row assumes cmp.data is sorted
      # by edate within each party -- confirm against cmp.R.
      latest <- party.rows[max(eligible), ]
      corpus$lr[i] <- -1 * latest$lr
      corpus$pc[i] <- -1 * latest$pc
    }
  }
}

# Speaker-level data set: keep untranslated texts of at least 200 tokens from
# the EP or national leaders, then drop every row with a missing value in any
# column.
corpus.speaker <- subset(
  corpus,
  transl == FALSE & length >= 200 & institution %in% c("EP", "Nat. leader")
)
corpus.speaker <- na.omit(corpus.speaker)

# Collapse to one row per speaker: all texts pasted together, scores averaged.
corpus.speaker <- corpus.speaker %>%
  group_by(speaker) %>%
  summarise(
    text = paste(text, collapse = " "),
    lr = mean(lr),
    pc = mean(pc)
  ) %>%
  ungroup()

# Turn the speaker-level data frame into a quanteda corpus.
corpus.speaker <- corpus(corpus.speaker)

# Speech-level quanteda corpus restricted to untranslated texts of at least
# 200 tokens delivered in the EP or by national leaders.
supercorpus <- corpus_subset(
  corpus(corpus),
  transl == FALSE & length >= 200 & institution %in% c("EP", "Nat. leader")
)

#Analysis Sentiment and Wordfish
################################

# Document-feature matrix without stemming, keeping punctuation and numbers;
# documents are pooled by speaker and features are then trimmed with
# min_docfreq = 10.
dtm <- supercorpus %>%
  dfm(stem = FALSE, remove_punct = FALSE, remove_numbers = FALSE) %>%
  dfm_group(groups = "speaker") %>%
  dfm_trim(min_docfreq = 10)

#estimate wordfish model on the speaker-level matrix
wordfish <- textmodel_wordfish(dtm)

#apply the LSD2015 sentiment dictionary to the same matrix
dictionary.dtm <- dfm(dtm, dictionary = data_dictionary_LSD2015)

# Dictionary hits as a share of all tokens per document (a proportion, not yet
# multiplied by 100). Columns are addressed by position: 1 + 3 are combined as
# "negative" and 2 + 4 as "positive". This assumes the lookup keeps the
# LSD2015 key order negative, positive, neg_positive, neg_negative -- TODO
# confirm for the installed quanteda version.
tokens.per.doc <- ntoken(dtm)
neg.hits <- dictionary.dtm[, 1] + dictionary.dtm[, 3]
pos.hits <- dictionary.dtm[, 2] + dictionary.dtm[, 4]
perc.neg.words <- as.numeric(neg.hits / tokens.per.doc)
perc.pos.words <- as.numeric(pos.hits / tokens.per.doc)

# One row per document of dtm: sentiment shares, speaker and wordfish position.
data <- data.frame(perc.neg.words,
                   perc.pos.words,
                   speaker = docvars(dtm)$speaker)

#collect estimated positions
data$position <- wordfish$theta

#negative words
###############

# Per-speaker mean, standard deviation and number of observations of the
# negative-word share, with a normal-approximation 95% interval
# (mean +/- 1.96 standard errors). A speaker with a single observation has
# sd() = NA, so both interval bounds are NA for that speaker.
mean.neg.words <- tapply(data[["perc.neg.words"]], data[["speaker"]], mean)
sd.neg.words <- tapply(data[["perc.neg.words"]], data[["speaker"]], sd)
length.neg.words <- tapply(data[["perc.neg.words"]], data[["speaker"]], length)
se.neg.words <- sd.neg.words / sqrt(length.neg.words)
low.neg.words <- mean.neg.words - 1.96 * se.neg.words
hi.neg.words <- mean.neg.words + 1.96 * se.neg.words

neg.words <- data.frame(mean.neg.words, low.neg.words, hi.neg.words)

#positive words
###############

# Same summaries for the positive-word share.
mean.pos.words <- tapply(data[["perc.pos.words"]], data[["speaker"]], mean)
sd.pos.words <- tapply(data[["perc.pos.words"]], data[["speaker"]], sd)
length.pos.words <- tapply(data[["perc.pos.words"]], data[["speaker"]], length)
se.pos.words <- sd.pos.words / sqrt(length.pos.words)
low.pos.words <- mean.pos.words - 1.96 * se.pos.words
hi.pos.words <- mean.pos.words + 1.96 * se.pos.words

pos.words <- data.frame(mean.pos.words, low.pos.words, hi.pos.words)

#position
#########

# Mean wordfish position per speaker. All tapply() results share the same
# speaker ordering, so the columns line up; row names are the speaker names.
position <- tapply(data[["position"]], data[["speaker"]], mean)
data.speaker <- data.frame(neg.words, pos.words, position)

# Keep speakers whose plotted quantities are complete. The interval columns
# are deliberately excluded from this check: they are NA for every speaker
# with a single observation, and na.omit() on the full frame would therefore
# discard those speakers even though the intervals are never plotted.
plotted.cols <- c("mean.neg.words", "mean.pos.words", "position")
data.speaker <- data.speaker[complete.cases(data.speaker[, plotted.cols]), ]

#plot Figure 4 in the paper
###########################

# Speaker names are carried as row names; copy them into a column for labels.
data.speaker$name <- row.names(data.speaker)

# Build one panel of Figure 4: a sentiment share (shown in %) against the
# wordfish position, with a linear fit and repelled labels.
#   speaker.data : data frame with columns position, name and `share.col`
#   share.col    : name of the column holding the share (as a proportion)
#   y.label      : y-axis title
#   label.cutoff : only speakers with |position| above this value are labelled
# Returns the ggplot object.
sentiment_figure <- function(speaker.data, share.col, y.label,
                             label.cutoff = 0.75) {
  plot.data <- data.frame(
    position = speaker.data$position,
    share.pct = 100 * speaker.data[[share.col]],
    # Empty labels are kept so that only the extreme speakers get a label.
    tag = ifelse(abs(speaker.data$position) > label.cutoff,
                 as.character(speaker.data$name), "")
  )

  ggplot(plot.data, aes(x = position, y = share.pct)) +
    geom_point(shape = 1) +
    geom_smooth(method = "lm", size = 0.5, alpha = 1/5, color = "black") +
    theme_minimal() +
    theme(axis.text.x = element_text(vjust = 1, hjust = .5),
          text = element_text(size = 15),
          legend.position = "none") +
    xlab("wordfish position") +
    ylab(y.label) +
    geom_label_repel(aes(label = tag),
                     box.padding   = 0.35,
                     point.padding = 0.5,
                     segment.color = "grey50")
}

# Figure 4b: negative sentiment. Arguments to ggsave() are spelled out rather
# than relying on partial matching of `file` to `filename`.
fig <- sentiment_figure(data.speaker, "mean.neg.words",
                        "% negative sentiment words")
ggsave(filename = "Fig4b.jpg", plot = fig, height = 6, width = 9)

# Figure 4a: positive sentiment.
fig <- sentiment_figure(data.speaker, "mean.pos.words",
                        "% positive sentiment words")
ggsave(filename = "Fig4a.jpg", plot = fig, height = 6, width = 9)


